Main
Before filtering
Summarize number of mapped reads and number of detected genes (counts > 0)
data.init$metadata <-
dplyr::inner_join(data.init$metadata,
summarize_sequencing_results(data.init$counts), by="SampleName")
Plot - Detected genes x Mapped reads
plot_detected_genes_vs_mapped_reads(
data.init$metadata, params$min.genes, params$min.reads, params$set.scale)
Initial gene filtering and normalization
data.init$counts.gf <-
filter_genes_by_cpm_per_condition(data.init$counts, data.init$metadata, min.cpm=params$min.cpm)
# Normalize across all conditions
data.init$tmm.cpm.gf <-
transform_counts_to_tmm_cpm(data.init$counts.gf, data.init$metadata, log2=FALSE)
data.init$log2.tmm.cpm.gf <-
transform_counts_to_tmm_cpm(data.init$counts.gf, data.init$metadata,
log2=TRUE, shift.min=TRUE)
Plot - PCA
Check sample clustering before removing low quality samples
data.init$pca <- run_pca(data.init$log2.tmm.cpm.gf)
By mapped reads
plot_pca_by_mapped_reads(data.init$pca, data.init$metadata, params$min.reads)
## Warning: Removed 10 rows containing missing values (geom_point).
plot_pca_by_mapped_reads_gradient(data.init$pca, data.init$metadata)
## Warning: Removed 10 rows containing missing values (geom_point).
By detected genes
plot_pca_by_detected_genes(data.init$pca, data.init$metadata, params$min.genes)
## Warning: Removed 10 rows containing missing values (geom_point).
plot_pca_by_detected_genes_gradient(data.init$pca, data.init$metadata)
## Warning: Removed 10 rows containing missing values (geom_point).
By batch
plot_pca_by_batch(data.init$pca, data.init$metadata)
## Warning: Removed 10 rows containing missing values (geom_point).
By condition
plot_pca_by_condition(data.init$pca, data.init$metadata)
## Warning: Removed 10 rows containing missing values (geom_point).
plot_pca_by_condition_pc34(data.init$pca, data.init$metadata)
Filter 1: Sequencing quality
Remove samples with less than minimum detected genes OR less than minimum mapped reads
data.qc1.seq <-
filter_samples_by_detected_genes_mapped_reads(
data.init$counts, data.init$metadata, params$min.genes, params$min.reads)
# Remove tissue-sex conditions with few replicates remaining
data.qc1.seq <-
filter_conditions_by_min_replicates(
data.qc1.seq$counts, data.qc1.seq$metadata, params$min.replicates)
## [1] "Removing conditions:"
## [1] "F[0-9][0-9]_heart" "M[0-9][0-9]_heart" "F[0-9][0-9]_muscle" "M[0-9][0-9]_muscle"
## [5] "F[0-9][0-9]_swim_bladder" "M[0-9][0-9]_swim_bladder"
data.qc1.seq$counts.gf <-
filter_genes_by_cpm_per_condition(data.qc1.seq$counts, data.qc1.seq$metadata, min.cpm=params$min.cpm)
# Normalize across all conditions
data.qc1.seq$tmm.cpm.gf <-
transform_counts_to_tmm_cpm(data.qc1.seq$counts.gf, data.qc1.seq$metadata, log2=FALSE)
data.qc1.seq$log2.tmm.cpm.gf <-
transform_counts_to_tmm_cpm(data.qc1.seq$counts.gf, data.qc1.seq$metadata,
log2=TRUE, shift.min=TRUE)
plot_detected_genes_vs_mapped_reads(data.qc1.seq$metadata, params$min.genes, params$min.reads)
Plot - PCA
data.qc1.seq$pca <- run_pca(data.qc1.seq$log2.tmm.cpm.gf)
By mapped reads
plot_pca_by_mapped_reads_gradient(data.qc1.seq$pca, data.qc1.seq$metadata)
plot_pca_by_detected_genes_gradient(data.qc1.seq$pca, data.qc1.seq$metadata)
By detected genes
plot_pca_by_detected_genes_gradient(data.qc1.seq$pca, data.qc1.seq$metadata)
By batch
plot_pca_by_batch(data.qc1.seq$pca, data.qc1.seq$metadata)
By condition
plot_pca_by_condition(data.qc1.seq$pca, data.qc1.seq$metadata)
plot_pca_by_condition_pc34(data.qc1.seq$pca, data.qc1.seq$metadata)
Plot - Correlation heatmap
data.qc1.seq$cor.matrix <- cor(data.qc1.seq$log2.tmm.cpm.gf, method="pearson")
plot_correlation_heatmap(data.qc1.seq$cor.matrix, data.qc1.seq$metadata, clust=TRUE)
Filter 2: Within-tissue correlation
Iterative removal of low correlation samples
data.qc2.cor <-
iterate_filter_samples_by_correlation(
data.qc1.seq$counts, data.qc1.seq$metadata, min.cpm=params$min.cpm, round=2)
## [1] "Filtering and normalizing counts..."
## [1] "Removing samples:"
## [1] "F04_pectoral_fin"
## [1] "Filtering and normalizing counts..."
## [1] "No samples to remove"
# Remove tissue-sex conditions with few replicates remaining
data.qc2.cor <-
filter_conditions_by_min_replicates(
data.qc2.cor$counts, data.qc2.cor$metadata, params$min.replicates)
## [1] "No conditions to remove"
data.qc2.cor$counts.gf <-
filter_genes_by_cpm_per_condition(
data.qc2.cor$counts, data.qc2.cor$metadata, min.cpm=params$min.cpm)
# Normalize across all conditions
data.qc2.cor$tmm.cpm.gf <-
transform_counts_to_tmm_cpm(data.qc2.cor$counts.gf, data.qc2.cor$metadata, log2=FALSE)
data.qc2.cor$log2.tmm.cpm.gf <-
transform_counts_to_tmm_cpm(data.qc2.cor$counts.gf, data.qc2.cor$metadata,
log2=TRUE, shift.min=TRUE)
Plot - PCA
data.qc2.cor$pca <- run_pca(data.qc2.cor$log2.tmm.cpm.gf)
By batch
plot_pca_by_batch(data.qc2.cor$pca, data.qc2.cor$metadata)
By condition
plot_pca_by_condition(data.qc2.cor$pca, data.qc2.cor$metadata)
plot_pca_by_condition_pc34(data.qc2.cor$pca, data.qc2.cor$metadata)
Plot - Correlation heatmap
data.qc2.cor$cor.matrix <- cor(data.qc2.cor$log2.tmm.cpm.gf, method="pearson")
plot_correlation_heatmap(data.qc2.cor$cor.matrix, data.qc2.cor$metadata, clust=TRUE)
Boxplot - Detected genes per organ
boxplot_detected_genes(data.qc2.cor$metadata)
Gene biotype
Split genes by biotype
data.qc2.cor$coding$log2.tmm.cpm.gf <-
data.qc2.cor$log2.tmm.cpm.gf[
rownames(data.qc2.cor$log2.tmm.cpm.gf)
%in% biotype$Gene.stable.ID[biotype$Gene.type=="protein_coding"],]
data.qc2.cor$lncrna$log2.tmm.cpm.gf <-
data.qc2.cor$log2.tmm.cpm.gf[
rownames(data.qc2.cor$log2.tmm.cpm.gf)
%in% biotype$Gene.stable.ID[biotype$Gene.type=="lncRNA" |
biotype$Gene.type=="lincRNA"],]
# Protein-coding
print(nrow(data.qc2.cor$coding$log2.tmm.cpm.gf))
## [1] 19287
# lncRNA
print(nrow(data.qc2.cor$lncrna$log2.tmm.cpm.gf))
## [1] 479
# Neither
nrow(data.qc2.cor$log2.tmm.cpm.gf) -
(nrow(data.qc2.cor$coding$log2.tmm.cpm.gf) + nrow(data.qc2.cor$lncrna$log2.tmm.cpm.gf))
## [1] 590
Protein-coding
Plot - PCA
data.qc2.cor$coding$pca <- run_pca(data.qc2.cor$coding$log2.tmm.cpm.gf)
By batch
plot_pca_by_batch(data.qc2.cor$coding$pca, data.qc2.cor$metadata)
By condition
plot_pca_by_condition(data.qc2.cor$coding$pca, data.qc2.cor$metadata)
plot_pca_by_condition_pc34(data.qc2.cor$coding$pca, data.qc2.cor$metadata)
Plot - Correlation heatmap
data.qc2.cor$coding$cor.matrix <-
cor(data.qc2.cor$coding$log2.tmm.cpm.gf, method="pearson")
plot_correlation_heatmap(data.qc2.cor$coding$cor.matrix,
data.qc2.cor$metadata, clust=TRUE)
lncRNA
Plot - PCA
data.qc2.cor$lncrna$pca <- run_pca(data.qc2.cor$lncrna$log2.tmm.cpm.gf)
By batch
plot_pca_by_batch(data.qc2.cor$lncrna$pca, data.qc2.cor$metadata)
By condition
plot_pca_by_condition(data.qc2.cor$lncrna$pca, data.qc2.cor$metadata)
plot_pca_by_condition_pc34(data.qc2.cor$lncrna$pca, data.qc2.cor$metadata)
Plot - Correlation heatmap
data.qc2.cor$lncrna$cor.matrix <-
cor(data.qc2.cor$lncrna$log2.tmm.cpm.gf, method="pearson")
plot_correlation_heatmap(data.qc2.cor$lncrna$cor.matrix,
data.qc2.cor$metadata, clust=TRUE)
Reduced (all nonzero) expression matrix
data.qc2.cor.nonzero <- data.qc2.cor[c("conditions","metadata","counts")]
# Only keep genes with no zero counts across all samples
# to set the same number of 'expressed genes' per condition
data.qc2.cor.nonzero$counts.gf <-
keep_full_nonzero_expression_matrix(data.qc2.cor.nonzero$counts)
# Normalize across all conditions
data.qc2.cor.nonzero$tmm.cpm.gf <-
transform_counts_to_tmm_cpm(data.qc2.cor.nonzero$counts.gf, data.qc2.cor.nonzero$metadata, log2=FALSE)
data.qc2.cor.nonzero$log2.tmm.cpm.gf <-
transform_counts_to_tmm_cpm(data.qc2.cor.nonzero$counts.gf, data.qc2.cor.nonzero$metadata,
log2=TRUE, shift.min=FALSE)
Plot - PCA
data.qc2.cor.nonzero$pca <- run_pca(data.qc2.cor.nonzero$log2.tmm.cpm.gf)
By batch
plot_pca_by_batch(data.qc2.cor.nonzero$pca, data.qc2.cor.nonzero$metadata)
By condition
plot_pca_by_condition(data.qc2.cor.nonzero$pca, data.qc2.cor.nonzero$metadata)
plot_pca_by_condition_pc34(data.qc2.cor.nonzero$pca, data.qc2.cor.nonzero$metadata)
Plot - Correlation heatmap
data.qc2.cor.nonzero$cor.matrix <- cor(data.qc2.cor.nonzero$log2.tmm.cpm.gf, method="pearson")
plot_correlation_heatmap(data.qc2.cor.nonzero$cor.matrix, data.qc2.cor.nonzero$metadata, clust=TRUE)
Normalization per condition
data.conditions <- vector(mode="list", length=6)
names(data.conditions) <-
c("conditions","metadata","counts","counts.gf","tmm.cpm.gf","log2.tmm.cpm.gf")
data.conditions$conditions <- data.qc2.cor$conditions
data.conditions$metadata <- data.qc2.cor$metadata
Tissue <- as.factor(data.qc2.cor$metadata$Tissue)
if ("Sex" %in% colnames(data.qc2.cor$metadata)) {
Sex <- as.factor(data.qc2.cor$metadata$Sex)
data.conditions$counts <-
lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
lapply(setNames(levels(Sex), levels(Sex)), function(s) {
subset_expr_matrix(data.qc2.cor$counts, tissue=t, sex=s)
})
})
data.conditions$counts.gf <-
lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
lapply(setNames(levels(Sex), levels(Sex)), function(s) {
filter_genes_by_cpm(data.conditions$counts[[t]][[s]],
min.cpm=params$min.cpm, tmm=FALSE)
})
})
data.conditions$tmm.cpm.gf <-
lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
lapply(setNames(levels(Sex), levels(Sex)), function(s) {
transform_counts_to_tmm_cpm(
data.conditions$counts.gf[[t]][[s]],
data.conditions$metadata[data.conditions$metadata$Tissue==t &
data.conditions$metadata$Sex==s,],
log2=FALSE)
})
})
data.conditions$log2.tmm.cpm.gf <-
lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
lapply(setNames(levels(Sex), levels(Sex)), function(s) {
transform_counts_to_tmm_cpm(
data.conditions$counts.gf[[t]][[s]],
data.conditions$metadata[data.conditions$metadata$Tissue==t &
data.conditions$metadata$Sex==s,],
log2=TRUE, shift.min=TRUE)
})
})
} else {
data.conditions$counts <-
lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
subset_expr_matrix(data.qc2.cor$counts, tissue=t)
})
data.conditions$counts.gf <-
lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
filter_genes_by_cpm(data.conditions$counts[[t]],
min.cpm=params$min.cpm, tmm=FALSE)
})
data.conditions$tmm.cpm.gf <-
lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
transform_counts_to_tmm_cpm(
data.conditions$counts.gf[[t]],
data.conditions$metadata[data.conditions$metadata$Tissue==t,],
log2=FALSE)
})
data.conditions$log2.tmm.cpm.gf <-
lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
transform_counts_to_tmm_cpm(
data.conditions$counts.gf[[t]],
data.conditions$metadata[data.conditions$metadata$Tissue==t,],
log2=TRUE, shift.min=TRUE)
})
}
Reduced (all nonzero) expression matrix
data.conditions.nonzero <- vector(mode="list", length=6)
names(data.conditions.nonzero) <-
c("conditions","metadata","counts","counts.gf","tmm.cpm.gf","log2.tmm.cpm.gf")
data.conditions.nonzero$conditions <- data.qc2.cor$conditions
data.conditions.nonzero$metadata <- data.qc2.cor$metadata
Tissue <- as.factor(data.qc2.cor$metadata$Tissue)
if ("Sex" %in% colnames(data.qc2.cor$metadata)) {
Sex <- as.factor(data.qc2.cor$metadata$Sex)
data.conditions.nonzero$counts <-
lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
lapply(setNames(levels(Sex), levels(Sex)), function(s) {
subset_expr_matrix(data.qc2.cor$counts, tissue=t, sex=s)
})
})
data.conditions.nonzero$counts.gf <-
lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
lapply(setNames(levels(Sex), levels(Sex)), function(s) {
data.conditions.nonzero$counts[[t]][[s]][rownames(data.qc2.cor.nonzero$counts.gf),]
})
})
data.conditions.nonzero$tmm.cpm.gf <-
lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
lapply(setNames(levels(Sex), levels(Sex)), function(s) {
transform_counts_to_tmm_cpm(
data.conditions.nonzero$counts.gf[[t]][[s]],
data.conditions.nonzero$metadata[data.conditions.nonzero$metadata$Tissue==t &
data.conditions.nonzero$metadata$Sex==s,],
log2=FALSE)
})
})
data.conditions.nonzero$log2.tmm.cpm.gf <-
lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
lapply(setNames(levels(Sex), levels(Sex)), function(s) {
transform_counts_to_tmm_cpm(
data.conditions.nonzero$counts.gf[[t]][[s]],
data.conditions.nonzero$metadata[data.conditions.nonzero$metadata$Tissue==t &
data.conditions.nonzero$metadata$Sex==s,],
log2=TRUE, shift.min=TRUE)
})
})
} else {
data.conditions.nonzero$counts <-
lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
subset_expr_matrix(data.qc2.cor$counts, tissue=t)
})
data.conditions.nonzero$counts.gf <-
lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
data.conditions.nonzero$counts[[t]][rownames(data.qc2.cor.nonzero$counts.gf),]
})
data.conditions.nonzero$tmm.cpm.gf <-
lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
transform_counts_to_tmm_cpm(
data.conditions.nonzero$counts.gf[[t]],
data.conditions.nonzero$metadata[data.conditions.nonzero$metadata$Tissue==t,],
log2=FALSE)
})
data.conditions.nonzero$log2.tmm.cpm.gf <-
lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
transform_counts_to_tmm_cpm(
data.conditions.nonzero$counts.gf[[t]],
data.conditions.nonzero$metadata[data.conditions.nonzero$metadata$Tissue==t,],
log2=TRUE, shift.min=TRUE)
})
}